#include <PalmTypes.h>


//This assumes the pointer we are passed is not within 256 bytes of the end of RAM; this is assured by over-allocation in the 68k code.



//value returned by vmV2P() when a virtual address has no valid translation
#define NOT_FOUND	0xFFFFFFFF

//forward declarations for helpers defined further down in this file
void physMemWrite(UInt32 addr, UInt32 value);			//store a word to addr with the MMU switched off
UInt32 physMemRead(UInt32 addr);						//load a word from addr with the MMU switched off
UInt32 vmV2P(UInt32 topLevelTablePA, void* addr);		//walk the page tables; returns NOT_FOUND on failure
UInt32 runAsSupervisor(void* param1, void* param2, void* proc);	//NOTE(review): declared here but neither defined nor called in the visible code

//Memory-mapped register block of a PXA UART. The code in this file overlays
//it on 0x91600000 (while the OS mappings are live) and on 0x41600000 (after
//identity mapping). Offsets in the comments assume a 4-byte UInt32 and no
//padding between members. Registers that share an offset are unions.
struct PxaUart {
	
	//offset 0x00
	union {
		volatile UInt32 RBR;	//RO
		volatile UInt32 THR;	//WO
		volatile UInt32 DLL;	//only while DLAB is set
	};
	
	//offset 0x04
	union {
		volatile UInt32 IER;
		volatile UInt32 DLH;	//only while DLAB is set
	};
	
	//offset 0x08
	union {
		volatile UInt32 IIR;	//RO
		volatile UInt32 FCR;	//WO
	};
	
	volatile UInt32 LCR;		//offset 0x0C
	volatile UInt32 MCR;		//offset 0x10
	volatile UInt32 LSR;		//offset 0x14
	volatile UInt32 MSR;		//offset 0x18
	volatile UInt32 SPR;		//offset 0x1C
	volatile UInt32 ISR;		//offset 0x20
	
	//the remaining registers exist on the HWUART only
	volatile UInt32 FOR;		//offset 0x24
	volatile UInt32 ABR;		//offset 0x28
	volatile UInt32 ACR;		//offset 0x2C
};

//Register addresses. Neither macro is referenced by the code visible in this file.
//NOTE(review): the names look like LCD controller registers - confirm the addresses against the SoC manual before using them.
#define LCCR0				0x40000000			//control reg
#define LCSR				0x40000038


//kernelMoveAndRun: copy a kernel image to 0xa0000000 and jump to it. Never returns.
//
//  in:  r0 = kimg, pointer to the image. The first word is the payload size in
//            bytes, the payload itself follows immediately after it.
//  regs: r1 = destination end address (0xa0000000 + size)
//        r2 = destination cursor, r3 = word being copied
//
//The copy loop stores one word BEFORE testing for the end and terminates on
//equality only, so the size must be a non-zero multiple of 4 or the loop will
//run past the intended end.
//
//This routine is position independent and is itself copied elsewhere before use:
//loaderArm_stage3 copies exactly 11 words starting at this symbol, so the
//instruction count below must not change.
asm void kernelMoveAndRun(void* kimg)			//11 instrs, 44 bytes precisely
{
	ldr r1, [r0], #4	//size
	mov r2, #0xa0000000
	add r1, r1, r2		//dst end
loop:
	ldr r3, [r0], #4
	str r3, [r2], #4
	cmp r2, r1
	bne loop
	//we wrote code...
	mov r0, #0
	MCR p15, 0, r0, c7,  c7, 0	//invalidate caches
	MCR p15, 0, r0, c7,  c10, 4	//drain write buffer
	mov pc, #0xa0000000
}

//loaderArm_stage3: final C stage of the loader. It is entered from
//loaderArm_stage2 (which only sets up sp) and therefore runs at the address
//stage1 obtained from vmV2P() for stage2.
//
//  image: address of the kernel image as translated by stage1; handed unchanged
//         to the relocated copy of kernelMoveAndRun.
//
//Sequence (each step is bracketed by a POST character written twice to the UART,
//'6' through 'b'):
//  1. clean and invalidate the data caches, then clear control-register bits 2, 3 and 12
//  2. read the translation table base and rewrite all 0x1000 first-level entries
//     as identity-mapped sections
//  3. unlock/invalidate caches and TLBs and write 0x0072 to the control register
//  4. copy kernelMoveAndRun (11 words) to 0xa3ffffd0 and call it
//
//The function is declared as returning UInt32 but contains no return statement:
//the call at the very end jumps into kernelMoveAndRun, which ends by loading pc
//and does not come back. `ret_val` is never used.
UInt32 loaderArm_stage3(void* image){	//no longer safe to use physMemWrite - we are in an uncacheable page using an uncacheable stack
	
	UInt32 *topLevelTable = 0;
	UInt32 i, ret_val;
	
	
	//POST '6' - still using the UART through its 0x91600000 mapping
	{
		struct PxaUart *uart =  (struct PxaUart*)0x91600000;
		uart->THR = '6';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
		uart->THR = '6';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
	}
	
	//flush mini icache & dcache and disable data cache
	{
		//clean mini
		//(reads 8192 bytes, one word every 32 bytes, through the 0x08000000 window that stage1 mapped as "mini data cacheable")
		for (i = 0; i < 8192; i += 32)
			(void)*(volatile UInt32*)(i + 0x08000000);
		
		//clean main
		//(issues CP15 c7,c2,5 for 65536 bytes, one op every 32 bytes, in the 0x09000000 window that stage1 mapped for dcache cleaning)
		//NOTE(review): c7,c2,5 is presumably the XScale "allocate data cache line" operation - confirm
		for (i = 0; i < 65536; i += 32) {
			UInt32 j = i + 0x09000000;
			
			asm(MCR p15,0,j,c7,c2,5)
		}
		
		//inval both
		i = 0;
		asm(MCR P15, 0, i, c7, c6, 0);
		
		//turn off d-caching
		//(read-modify-write of the CP15 control register, clearing bits 2, 3 and 12)
		asm(MRC p15,0,i,c1,c0,0);
		i &=~ ((1 << 2) | (1 << 3) | (1 << 12));	//kill write buffer, dcache, icache
		asm(MCR p15,0,i,c1,c0,0);
	}
	
	//POST '7'
	{
		struct PxaUart *uart =  (struct PxaUart*)0x91600000;
		uart->THR = '7';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
		uart->THR = '7';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
	}
	
	//find the top-level table
	//(CP15 c2 with the low 13 bits masked off)
	//NOTE(review): the mask clears bits 12:0 only; if the table base is 16KB aligned the usual mask would be 0x3FFF - confirm
	{
		asm(MRC p15,0,i,c2,c0,0);
		i &=~ 0x1FFF;
		topLevelTable = (void*)i;
	}
	
	//identity map everything
	//(every one of the 0x1000 first-level entries becomes (index << 20) | 0xC02; low bits == 2 is the
	// "section" type that vmV2P() decodes, so each entry maps its own 1MB onto itself)
	{
		for(i = 0; i < 0x1000; i++){
			
			topLevelTable[i] = (i << 20) | 0xC02;		//go there
		}
		//NOTE(review): i is 0x1000 here, not 0, unlike the other TLB invalidates in this file which pass 0
		asm(MCR p15, 0, i, c8,  c7, 0);	//invalidate TLBs
	}
	
	//POST '8' - from here on the UART is addressed at 0x41600000 instead of 0x91600000
	{
		struct PxaUart *uart =  (struct PxaUart*)0x41600000;
		uart->THR = '8';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
		uart->THR = '8';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
	}
	
	//now we can disable, clean and invalidate the caches
	{
		asm(
			
			MOV i, #0
			MCR p15, 0, i, c9,  c2, 0	//no longer autolock things into dcache
			MCR p15, 0, i, c9,  c1, 1	//unlock i cache
			MCR p15, 0, i, c9,  c2, 1	//unlock d cache
			MCR p15, 0, i, c7,  c7, 0	//invalidate i cache
			MCR p15, 0, i, c7,  c6, 0	//invalidate d cache
			MCR p15, 0, i, c7,  c10, 4	//drain write buffer
			MCR p15, 0, i, c10, c4, 1	//unlock i-tlb
			MCR p15, 0, i, c10, c8, 1	//unlock d-tlb
			MCR p15, 0, i, c8,  c7, 0	//invalidate TLBs
			
			//CPWAIT
			MRC p15,0,i,c2,c0,0
			MOV i,i
			SUB PC,PC,#4
			NOP
			
			MOV i,#0x0072;				//off: mmu, dcache, write buffer, branch predict, icache  on: alignment checking
			MCR p15,0,i,c1,c0,0
			
			//CPWAIT:
			MRC p15,0,i,c2,c0,0
			MOV i,i
			SUB PC,PC,#4
			NOP
			
			//CPWAIT
			MRC p15,0,i,c2,c0,0
			MOV i,i
			SUB PC,PC,#4
			NOP
		);
	}
	
	
	//POST '9'
	{
		struct PxaUart *uart = (struct PxaUart*)0x41600000;
		uart->THR = '9';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
		uart->THR = '9';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
	}
	
	//copy the kernel moving func to end of ram, go to it
	//(11 words at 0xa3ffffd0 end at 0xa3fffffc, i.e. the last 44 bytes of the 64MB that stage1 mapped at 0xa0000000)
	{
		UInt32 *f = (UInt32*)0xa3ffffd0, *s = (UInt32*)&kernelMoveAndRun;
		UInt32 i;	//shadows the outer i
		
		//the count must match the number of instructions in kernelMoveAndRun
		for (i = 0; i < 11; i++)
			f[i] = s[i];
		
		//POST 'a'
		{
			struct PxaUart *uart =  (struct PxaUart*)0x41600000;
			uart->THR = 'a';
			while (!(uart->LSR & 0x40));	//wait for TDRQ
			uart->THR = 'a';
			while (!(uart->LSR & 0x40));	//wait for TDRQ
		}
		
		//we wrote code...
		i = 0;
		asm(MCR p15, 0, i, c7,  c7, 0)	//invalidate caches
		asm(MCR p15, 0, i, c7,  c10, 4)	//drain write buffer
		
		//POST 'b' - last sign of life from the loader
		{
			struct PxaUart *uart =  (struct PxaUart*)0x41600000;
			uart->THR = 'b';
			while (!(uart->LSR & 0x40));	//wait for TDRQ
			uart->THR = 'b';
			while (!(uart->LSR & 0x40));	//wait for TDRQ
		}
		//jump into the relocated copy; it ends with "mov pc, #0xa0000000" and does not return
		((void (*)(void*))f)(image);
	}
}

//loaderArm_stage2: trampoline between stage1 and stage3.
//Loads sp with 0xa3ffff80 and branches (without link) to loaderArm_stage3.
//r0 (image) and lr are left untouched, so stage3 receives the same argument.
//0xa3ffff80 lies below 0xa3ffffd0, where stage3 later places its copy of
//kernelMoveAndRun.
asm UInt32 loaderArm_stage2(void* image)
{
	LDR sp, =0xa3ffff80		//set up a safe sp
	B   loaderArm_stage3
}

//loaderArm_stage1: first C stage of the loader, called from __ARMlet_Startup__.
//
//  unused: ignored
//  image:  virtual address of the kernel image (size word followed by payload,
//          see kernelMoveAndRun)
//
//Steps (progress is reported by writing POST characters '1'..'5', each twice,
//to the UART at 0x91600000):
//  1. configure the UART
//  2. write 0xD3 to CPSR_c (interrupts masked, supervisor mode)
//  3. read the translation table base
//  4. using physMemWrite, add section mappings: 64MB identity-mapped at
//     0xa0000000, plus two helper windows at 0x08000000 and 0x09000000 that
//     stage3 uses to clean the data caches
//  5. translate the addresses of loaderArm_stage2 and of the image with vmV2P
//     and call stage2 through the translated address
//
//On success this function does not return (stage2/stage3 end up jumping into
//the kernel). If either translation fails (vmV2P returns NOT_FOUND) the
//function returns to its caller instead of jumping to an invalid address; the
//POST output then stops at '5'. Note that the page table edits made in step 4
//are NOT undone on that path.
void loaderArm_stage1(int unused, UInt32* image){
	
	UInt32 topLevelTable = 0;
	
	
	
	//init serial post
	{
		struct PxaUart *uart =  (struct PxaUart*)0x91600000;
	
		//NOTE(review): the writes below are presumably pin direction / alternate-function
		//setup for GPIO 35, 48 and 49 and a clock enable for the UART - confirm against the SoC manual
		*(volatile UInt32*)0x90e00010 |= (1 << (35 - 32)) | (1 << (48 - 32));
		*(volatile UInt32*)0x90e00010 &=~ (1 << (49 - 32));
		
		*(volatile UInt32*)0x90e0001c = 1 << (35 - 32);
		*(volatile UInt32*)0x90e00060 &=~ 0x0f;
		*(volatile UInt32*)0x90e00060 |= 0x05;
		
		*(volatile UInt32*)0x91300004 |= 1 << 4;
		
		uart->IER = 0x00;	//uart off while we config it
		uart->FCR = 0x00;	//reset fifos, disable
		uart->LCR = 0x80;	//access DLAB
		uart->DLL = 0x08;	//divisor of 8 gives us 115200
		uart->DLH = 0x00;
		uart->LCR = 0x07;	//DLAB off, 8n2 selected
		
		//as this is a HWUART, we need to config a few more things
		uart->ABR = 0x00;	//autobaud off
		
		uart->IER = 0x40;	//unit on, no irqs
		
		uart->THR = '1';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
		uart->THR = '1';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
		
	}
	
	//disable all interrupts and enter supervisor mode
	{
		asm(MSR CPSR_c, #0xD3);		//NOTE: IDA 4.x will mis-dissasemble this instr
	}
	
	{
		struct PxaUart *uart =  (struct PxaUart*)0x91600000;
		uart->THR = '2';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
		uart->THR = '2';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
	}
	
	{
		struct PxaUart *uart =  (struct PxaUart*)0x91600000;
		uart->THR = '3';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
		uart->THR = '3';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
	}
	
	//find the top-level table
	//NOTE(review): the mask clears bits 12:0 only; if the table base is 16KB aligned the usual mask would be 0x3FFF - confirm
	{
		asm(MRC p15,0,topLevelTable,c2,c0,0);
		topLevelTable = topLevelTable &~ 0x1FFF;
	}
	
	{
		struct PxaUart *uart =  (struct PxaUart*)0x91600000;
		uart->THR = '4';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
		uart->THR = '4';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
	}
	
	//identity-map all RAM, also map 0x08000000 to 0 for mini dcache flushing, and 0x09000000 to nowhere for dcache cleaning
	//(the table is edited through physMemWrite, i.e. topLevelTable is treated as a physical address;
	// each first-level entry is 4 bytes and covers 1MB, hence the (addr >> 20) << 2 offset)
	{
		UInt32 i;
		
		for (i = 0; i < 64; i++) {
			
			UInt32 addr = 0xa0000000 + (i << 20);
			
			physMemWrite(topLevelTable + ((addr >> 20) << 2), addr | 0xC02);
		}
		physMemWrite(topLevelTable + ((0x08000000 >> 20) << 2), 0x00000000 | 0x1c0a);	//mini data cacheable
		physMemWrite(topLevelTable + ((0x09000000 >> 20) << 2), 0x00100000 | 0x0c0e);	//cacheable & bufferable
		
		//inval TLBs
		i = 0;
		asm(MCR p15, 0, i, c8,  c7, 0);
	}
	
	{
		struct PxaUart *uart =  (struct PxaUart*)0x91600000;
		uart->THR = '5';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
		uart->THR = '5';
		while (!(uart->LSR & 0x40));	//wait for TDRQ
	}
	
	//convert stage2 and image addrs to PAs, go to stage 2 in PA map
	{
		UInt32 stage2PA, imagePA;
		
		stage2PA = vmV2P(topLevelTable, &loaderArm_stage2);
		imagePA = vmV2P(topLevelTable, image);
		
		//a failed translation must not be used as a jump target or image pointer
		if (stage2PA == NOT_FOUND || imagePA == NOT_FOUND)
			return;
		
		((void (*)(void*))stage2PA)((void*)imagePA);
	}
}

////////////////////////   C utility code   /////////////////////////


//vmV2P: translate a virtual address to a physical one by walking the page
//tables by hand. All table reads go through physMemRead().
//
//  topLevelTablePA: physical address of the first-level translation table
//  virt_addr:       virtual address to translate
//  returns:         the physical address, or NOT_FOUND if either level holds
//                   an invalid (type 0) descriptor
//
//First-level descriptor types (low two bits): 0 invalid, 1 coarse second-level
//table, 2 section (1MB), 3 fine second-level table.
//Second-level descriptor types: 0 invalid, 1 large page (64KB), 2 small page
//(4KB), 3 tiny page (1KB).
//
//A coarse table is indexed by address bits 19:12, so each of its entries
//spans 4KB. A type-3 entry reached through a coarse table is therefore
//translated with a 4KB offset; decoding it as a 1KB tiny page would drop
//address bits 11:10.
//NOTE(review): on XScale this encoding is presumably the "extended small page" - confirm.
UInt32 vmV2P(UInt32 topLevelTablePA, void* virt_addr)
{
	UInt32 addr = (UInt32)virt_addr;
	UInt32 desc;
	int viaCoarseTable = 0;
	
	//read and decode the first-level entry (4 bytes per 1MB of address space)
	desc = physMemRead(topLevelTablePA + ((addr >> 20) << 2));
	
	switch (desc & 3) {
		
		case 1:		//coarse table: base in bits 31:10, 256 entries indexed by addr bits 19:12
			
			viaCoarseTable = 1;
			desc = (desc & ~ 0x3FF) | (((addr >> 12) & 0xFF) << 2);
			break;
		
		case 2:		//section: translation is complete
			
			return (addr & 0x000FFFFF) | (desc & 0xFFF00000);
		
		case 3:		//fine table: base in bits 31:12, 1024 entries indexed by addr bits 19:10
			
			desc = (desc & ~ 0xFFF) | (((addr >> 10) & 0x3FF) << 2);
			break;
		
		default:	//0: invalid addr
			
			return NOT_FOUND;
	}
	
	//desc now holds the address of the second-level entry; read and decode it
	desc = physMemRead(desc);
	
	switch (desc & 3) {
		
		case 1:		//large page
			
			return (desc & 0xFFFF0000) | (addr & 0x0000FFFF);
		
		case 2:		//small page
			
			return (desc & 0xFFFFF000) | (addr & 0x00000FFF);
		
		case 3:		//4KB when reached through a coarse table, otherwise tiny page
			
			if (viaCoarseTable)
				return (desc & 0xFFFFF000) | (addr & 0x00000FFF);
			return (desc & 0xFFFFFC00) | (addr & 0x000003FF);
	}
	
	//0: invalid addr (also guarantees that every path returns a value)
	return NOT_FOUND;
}

///////////////////////   ASM utility code   ////////////////////////

//physMemWrite: store `value` at `addr` while the MMU is switched off, so that
//`addr` is not translated.
//
//  in:   r0 = addr, r1 = value
//  regs: r2 = addr, r12 = value, r1 = saved CPSR, r3 = saved CP15 control register
//
//IRQs are masked (CPSR bit 0x80) for the duration and the original CPSR control
//bits are restored before returning.
//
//The body runs twice. On the first pass r0 is 0, so the compare yields NE and
//every "eq" instruction is skipped; r0 is then set to 1 and the loop repeats.
//On the second pass the "eq" instructions execute and the two "ne" instructions
//at the end are skipped. NOTE(review): the dry first pass is presumably there so
//that the instructions have already been fetched before the MMU is disabled - confirm.
asm void physMemWrite(UInt32 addr, UInt32 value){
	mov r2, r0
	mov r12,r1

	mrs	r1, cpsr
	orr	r0, r1, #0x80
	msr	cpsr_c, r0				// turn irq off

	mov r0,#0
l1:
	cmp r0,#1

	// r3 keeps the original control register; bits 0 and 3 are cleared in the copy that is written back
	mrceq p15, 0, r3, c1, c0, 0
	biceq r0, r3, #0x0009
	mcreq p15, 0, r0, c1, c0, 0 // turn MMU off

	mrceq p15, 0, r0, c2, c0, 0	// wait for action to take effect
	mrceq p15, 0, r0, c2, c0, 0	// wait for action to take effect
	streq r12,[r2]

	mcreq p15, 0, r3, c1, c0, 0	// restore MMU

	movne r0,#1
	bne l1

	msr	cpsr_c, r1				// restore irq

	bx  lr
}

//physMemRead: load and return the word at `addr` while the MMU is switched
//off, so that `addr` is not translated.
//
//  in:   r0 = addr
//  out:  r0 = word read
//  regs: r2 = addr, r1 = saved CPSR, r3 = saved CP15 control register
//
//IRQs are masked (CPSR bit 0x80) for the duration and the original CPSR control
//bits are restored before returning.
//
//The body runs twice. On the first pass r0 is 0, so the compare yields NE and
//every "eq" instruction is skipped; r0 is then set to 1 and the loop repeats.
//On the second pass the "eq" instructions execute and the two "ne" instructions
//at the end are skipped, leaving the loaded word in r0.
//NOTE(review): the dry first pass is presumably there so that the instructions
//have already been fetched before the MMU is disabled - confirm.
asm UInt32 physMemRead(UInt32 addr){
	mov r2, r0

	mrs	r1, cpsr
	orr	r0, r1, #0x80
	msr	cpsr_c, r0				// turn irq off

	mov r0,#0
l1:
	cmp r0,#1

	// r3 keeps the original control register; bits 0 and 3 are cleared in the copy that is written back
	mrceq p15, 0, r3, c1, c0, 0
	biceq r0, r3, #0x0009
	mcreq p15, 0, r0, c1, c0, 0 // turn MMU off

	mrceq p15, 0, r0, c2, c0, 0	// wait for action to take effect
	mrceq p15, 0, r0, c2, c0, 0	// wait for action to take effect
	ldreq r0,[r2]

	mcreq p15, 0, r3, c1, c0, 0	// restore MMU

	movne r0,#1
	bne l1

	msr	cpsr_c, r1				// restore irq

	bx  lr
}

//readphy: load and return the word at `addr` with the MMU switched off.
//
//Instruction for instruction this is the same sequence as physMemRead; the
//only difference is that it returns with "mov pc,lr" instead of "bx lr".
//It is not referenced by any code visible in this file.
//
//  in:   r0 = addr
//  out:  r0 = word read
//  regs: r2 = addr, r1 = saved CPSR, r3 = saved CP15 control register
//
//The body runs twice: the first pass (r0 == 0, NE) skips every "eq" instruction,
//sets r0 to 1 and loops; the second pass executes them.
asm UInt32 readphy(UInt32 addr){
	mov r2, r0

	mrs	r1, cpsr
	orr	r0, r1, #0x80
	msr	cpsr_c, r0				//@ turn irq off

	mov r0,#0
l1:
	cmp r0,#1

	//@ r3 keeps the original control register; bits 0 and 3 are cleared in the copy that is written back
	mrceq p15, 0, r3, c1, c0, 0
	biceq r0, r3, #0x0009
	mcreq p15, 0, r0, c1, c0, 0 //@ turn MMU off

	mrceq p15, 0, r0, c2, c0, 0	//@ wait for action to take effect
	mrceq p15, 0, r0, c2, c0, 0	//@ wait for action to take effect
	ldreq r0,[r2]

	mcreq p15, 0, r3, c1, c0, 0	//@ restore MMU

	movne r0,#1
	bne l1

	msr	cpsr_c, r1				//@ restore irq

	mov	pc,lr
}

//__ARMlet_Startup__: entry point. Switches the CPU to supervisor mode, runs
//loaderArm_stage1 on the caller's stack, then switches back and returns.
//
//  in: r0 = crap1, r1 = image, r2 = crap2. r0 and r1 are not modified before
//      the call, so stage1 receives them as its (unused, image) arguments.
//
//  R4  = caller's stack pointer (after the register push)
//  R5  = supervisor-mode stack pointer, saved so it can be restored afterwards
//  R10 = address of the first instruction of this function (PC reads as the
//        current instruction + 8). NOTE(review): presumably required by the
//        ARMlet runtime convention - nothing in the visible code reads it.
//
//If stage1 succeeds it never returns; the code after the BL only runs when
//stage1 comes back.
asm void __ARMlet_Startup__(void* crap1, UInt32* image, void* crap2)
{
	STMFD SP!, {R4, R5, R10, LR}
	SUB   R10,PC,#12
	MOV   R4, SP
	MSR   CPSR_c, #0x13	//go SVC
	MOV   R5, SP
	MOV   SP, R4
	BL    loaderArm_stage1
	MOV   SP, R5
	MSR   CPSR_c, #0x1F	//return to SYS
	LDMFD SP!, {R4, R5, R10, PC}
}